#LOADING THE DATASET
library(wooldridge)

# Work on a copy of the `happiness` data set shipped with wooldridge.
data2 <- happiness

# Show only the first rows instead of dumping all 17,137 observations.
head(data2)

# Inspect the structure: 17,137 obs. of 33 variables.
str(data2)
## 'data.frame': 17137 obs. of 33 variables:
## $ year : int 1994 1994 1994 1994 1994 1994 1994 1994 1994 1994 ...
## $ workstat : Factor w/ 10 levels "iap","working fulltime",..: 8 2 2 2 3 3 2 2 2 2 ...
## $ prestige : int 46 22 29 42 36 43 20 44 42 46 ...
## $ divorce : Factor w/ 5 levels "iap","yes","no",..: NA 3 3 2 NA NA 3 NA 3 NA ...
## $ widowed : Factor w/ 5 levels "iap","yes","no",..: 1 1 1 1 NA NA NA NA NA 1 ...
## $ educ : int 12 12 12 8 13 15 9 12 12 12 ...
## $ reg16 : Factor w/ 10 levels "foreign","new england",..: 3 1 1 1 3 3 3 1 1 3 ...
## $ babies : int 2 0 0 0 0 0 0 0 2 0 ...
## $ preteen : int 3 0 0 0 1 0 1 0 1 0 ...
## $ teens : int 0 0 0 0 1 0 1 0 0 1 ...
## $ income : Factor w/ 16 levels "iap","lt $1000",..: 10 NA 11 11 10 13 12 10 13 11 ...
## $ region : Factor w/ 10 levels "not assigned",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ attend : Factor w/ 10 levels "never","lt once a year",..: 4 8 9 3 3 4 9 9 4 9 ...
## $ happy : Factor w/ 6 levels "iap","very happy",..: 3 2 3 4 4 3 4 2 3 3 ...
## $ owngun : Factor w/ 6 levels "iap","yes","no",..: NA NA 1 NA 1 1 1 NA 1 1 ...
## $ tvhours : int 2 3 1 3 NA 0 10 4 2 NA ...
## $ vhappy : int 0 1 0 0 0 0 0 1 0 0 ...
## $ mothfath16 : int 1 0 0 0 0 0 1 1 0 1 ...
## $ black : int 1 1 1 0 1 1 0 1 1 1 ...
## $ gwbush04 : int NA NA NA NA NA NA NA NA NA NA ...
## $ female : int 1 0 1 0 1 1 1 0 1 1 ...
## $ blackfemale: int 1 0 1 0 1 1 0 0 1 1 ...
## $ gwbush00 : int NA NA NA NA NA NA NA NA NA NA ...
## $ occattend : int 1 0 0 0 0 1 0 0 1 0 ...
## $ regattend : int 0 0 1 0 0 0 1 1 0 1 ...
## $ y94 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ y96 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ y98 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ y00 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ y02 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ y04 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ y06 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ unem10 : int 1 0 NA 1 0 NA NA 0 NA 1 ...
## - attr(*, "time.stamp")= chr "22 Jan 2013 14:09"
## - attr(*, "label.table")=List of 15
## ..$ wrkstat : Named num [1:10] 0 1 2 3 4 5 6 7 8 9
## .. ..- attr(*, "names")= chr [1:10] "iap" "working fulltime" "working parttime" "temp not working" ...
## ..$ prestg80: Named num 0
## .. ..- attr(*, "names")= chr "dk,na,iap"
## ..$ divorce : Named num [1:5] 0 1 2 8 9
## .. ..- attr(*, "names")= chr [1:5] "iap" "yes" "no" "dk" ...
## ..$ widowed : Named num [1:5] 0 1 2 8 9
## .. ..- attr(*, "names")= chr [1:5] "iap" "yes" "no" "dk" ...
## ..$ educ : Named num [1:3] 97 98 99
## .. ..- attr(*, "names")= chr [1:3] "iap" "dk" "na"
## ..$ reg16 : Named num [1:10] 0 1 2 3 4 5 6 7 8 9
## .. ..- attr(*, "names")= chr [1:10] "foreign" "new england" "middle atlantic" "e. nor. central" ...
## ..$ babies : Named num [1:2] 8 9
## .. ..- attr(*, "names")= chr [1:2] "8 or more" "na"
## ..$ preteen : Named num [1:2] 8 9
## .. ..- attr(*, "names")= chr [1:2] "8 or more" "na"
## ..$ teens : Named num [1:2] 8 9
## .. ..- attr(*, "names")= chr [1:2] "8 or more" "na"
## ..$ income : Named num [1:16] 0 1 2 3 4 5 6 7 8 9 ...
## .. ..- attr(*, "names")= chr [1:16] "iap" "lt $1000" "$1000 to 2999" "$3000 to 3999" ...
## ..$ region : Named num [1:10] 0 1 2 3 4 5 6 7 8 9
## .. ..- attr(*, "names")= chr [1:10] "not assigned" "new england" "middle atlantic" "e. nor. central" ...
## ..$ attend : Named num [1:10] 0 1 2 3 4 5 6 7 8 9
## .. ..- attr(*, "names")= chr [1:10] "never" "lt once a year" "once a year" "sevrl times a yr" ...
## ..$ happy : Named num [1:6] 0 1 2 3 8 9
## .. ..- attr(*, "names")= chr [1:6] "iap" "very happy" "pretty happy" "not too happy" ...
## ..$ owngun : Named num [1:6] 0 1 2 3 8 9
## .. ..- attr(*, "names")= chr [1:6] "iap" "yes" "no" "refused" ...
## ..$ tvhours : Named num [1:3] -1 98 99
## .. ..- attr(*, "names")= chr [1:3] "iap" "dk" "na"
The given dataset has 17,137 observations and 33 variables.
summary(data2)
## year workstat prestige divorce
## Min. :1994 working fulltime:9214 Min. :17.00 iap : 0
## 1st Qu.:1996 retired :2459 1st Qu.:33.00 yes :2333
## Median :1998 keeping house :1912 Median :43.00 no :7421
## Mean :1999 working parttime:1812 Mean :43.88 dk : 0
## 3rd Qu.:2004 school : 521 3rd Qu.:51.00 na : 0
## Max. :2006 (Other) :1216 Max. :86.00 NA's:7383
## NA's : 3 NA's :854
## widowed educ reg16 babies
## iap :10718 Min. : 0.00 e. nor. central:3148 Min. :0.0000
## yes : 378 1st Qu.:12.00 south atlantic :2643 1st Qu.:0.0000
## no : 0 Median :13.00 middle atlantic:2620 Median :0.0000
## dk : 0 Mean :13.32 pacific :1738 Mean :0.2071
## na : 0 3rd Qu.:16.00 w. sou. central:1635 3rd Qu.:0.0000
## NA's: 6041 Max. :20.00 w. nor. central:1294 Max. :6.0000
## NA's :44 (Other) :4059 NA's :101
## preteen teens income region
## Min. :0.0000 Min. :0.0000 $25000 or more:9725 south atlantic :3340
## 1st Qu.:0.0000 1st Qu.:0.0000 $20000 - 24999:1278 e. nor. central:2881
## Median :0.0000 Median :0.0000 $10000 - 14999:1251 middle atlantic:2414
## Mean :0.2557 Mean :0.1814 $15000 - 19999:1099 pacific :2353
## 3rd Qu.:0.0000 3rd Qu.:0.0000 $8000 to 9999 : 399 w. sou. central:1782
## Max. :6.0000 Max. :7.0000 (Other) :1293 w. nor. central:1247
## NA's :101 NA's :88 NA's :2092 (Other) :3120
## attend happy owngun tvhours
## never :3213 iap : 0 iap :7180 Min. : 0.000
## every week :3041 very happy :5260 yes :4117 1st Qu.: 1.000
## once a year :2209 pretty happy :9791 no : 0 Median : 2.000
## sevrl times a yr:2118 not too happy:2086 refused: 0 Mean : 2.904
## 2-3x a month :1495 dk : 0 dk : 0 3rd Qu.: 4.000
## (Other) :4788 na : 0 na : 0 Max. :24.000
## NA's : 273 NA's :5840 NA's :5343
## vhappy mothfath16 black gwbush04
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.0000 Median :1.000 Median :0.0000 Median :1.000
## Mean :0.3069 Mean :0.693 Mean :0.1384 Mean :0.502
## 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:0.0000 3rd Qu.:1.000
## Max. :1.0000 Max. :1.000 Max. :1.0000 Max. :1.000
## NA's :5 NA's :15207
## female blackfemale gwbush00 occattend
## Min. :0.0000 Min. :0.00000 Min. :0.000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.000 1st Qu.:0.000
## Median :1.0000 Median :0.00000 Median :1.000 Median :0.000
## Mean :0.5591 Mean :0.08905 Mean :0.522 Mean :0.285
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:1.000 3rd Qu.:1.000
## Max. :1.0000 Max. :1.00000 Max. :1.000 Max. :1.000
## NA's :13701 NA's :273
## regattend y94 y96 y98
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.1312 Mean :0.1737 Mean :0.1683 Mean :0.1637
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## NA's :273
## y00 y02 y04 y06
## Min. :0.000 Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.000 Median :0.00000 Median :0.00000 Median :0.0000
## Mean :0.162 Mean :0.07989 Mean :0.07802 Mean :0.1742
## 3rd Qu.:0.000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.000 Max. :1.00000 Max. :1.00000 Max. :1.0000
##
## unem10
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.318
## 3rd Qu.:1.000
## Max. :1.000
## NA's :5796
colSums(is.na(data2))
## year workstat prestige divorce widowed educ
## 0 3 854 7383 6041 44
## reg16 babies preteen teens income region
## 0 101 101 88 2092 0
## attend happy owngun tvhours vhappy mothfath16
## 273 0 5840 5343 0 5
## black gwbush04 female blackfemale gwbush00 occattend
## 0 15207 0 0 13701 273
## regattend y94 y96 y98 y00 y02
## 273 0 0 0 0 0
## y04 y06 unem10
## 0 0 5796
The counts above show the number of missing values per column: gwbush04 has the most missing values (15,207), while among columns with any missing data, workstat has the fewest, with only 3.
colMeans(is.na(data2))
## year workstat prestige divorce widowed educ
## 0.0000000000 0.0001750598 0.0498336932 0.4308221976 0.3525121083 0.0025675439
## reg16 babies preteen teens income region
## 0.0000000000 0.0058936803 0.0058936803 0.0051350878 0.1220750423 0.0000000000
## attend happy owngun tvhours vhappy mothfath16
## 0.0159304429 0.0000000000 0.3407831009 0.3117815254 0.0000000000 0.0002917664
## black gwbush04 female blackfemale gwbush00 occattend
## 0.0000000000 0.8873781875 0.0000000000 0.0000000000 0.7994981619 0.0159304429
## regattend y94 y96 y98 y00 y02
## 0.0159304429 0.0000000000 0.0000000000 0.0000000000 0.0000000000 0.0000000000
## y04 y06 unem10
## 0.0000000000 0.0000000000 0.3382155570
From the output above, gwbush04 has the highest proportion of missing values (about 89%), while workstat has the lowest non-zero proportion (about 0.02%).
# Mean-impute the numeric columns: replace each NA with the rounded
# column mean computed over the non-missing values.
# (The original code repeated one assignment per column and imputed
# `educ` twice; a single loop removes the duplication.)
# NOTE(review): for the 0/1 indicators (gwbush04, gwbush00, unem10, ...)
# the rounded mean amounts to majority-class imputation.
num_cols <- c("prestige", "educ", "babies", "preteen", "teens", "tvhours",
              "mothfath16", "gwbush04", "gwbush00", "occattend",
              "regattend", "unem10")
for (col in num_cols) {
  data2[[col]][is.na(data2[[col]])] <- round(mean(data2[[col]], na.rm = TRUE))
}
Above, missing values in the numeric attributes are replaced with the rounded mean of each column.
# Return the most frequent non-NA value of `x` (the statistical mode).
# If several values tie for the highest frequency, the first one
# encountered is returned, so the result is always length 1 — the
# original `ux[tab == max(tab)]` could return every tied mode, and a
# length > 1 result would recycle (or error) when assigned into the NA
# positions of a column.
Modes <- function(x) {
  ux <- na.omit(unique(x))
  tab <- tabulate(match(x, ux))
  ux[which.max(tab)]
}
# Mode-impute the categorical columns: fill each NA with the most
# frequent level of that column, as returned by Modes().
cat_cols <- c("workstat", "divorce", "widowed", "income", "attend", "owngun")
for (col in cat_cols) {
  data2[[col]][is.na(data2[[col]])] <- Modes(data2[[col]])
}
The variables above are categorical, so their missing values are replaced with the mode of each column.
colSums(is.na(data2))
## year workstat prestige divorce widowed educ
## 0 0 0 0 0 0
## reg16 babies preteen teens income region
## 0 0 0 0 0 0
## attend happy owngun tvhours vhappy mothfath16
## 0 0 0 0 0 0
## black gwbush04 female blackfemale gwbush00 occattend
## 0 0 0 0 0 0
## regattend y94 y96 y98 y00 y02
## 0 0 0 0 0 0
## y04 y06 unem10
## 0 0 0
After imputing with the mean and mode, there are no remaining missing values in the data.
# Convert the 0/1 indicator columns and the survey-year dummies from
# integer to factor, one lapply() pass instead of seventeen assignments.
indicator_cols <- c("vhappy", "mothfath16", "black", "gwbush04", "female",
                    "blackfemale", "gwbush00", "occattend", "regattend",
                    "y94", "y96", "y98", "y00", "y02", "y04", "y06",
                    "unem10")
data2[indicator_cols] <- lapply(data2[indicator_cols], as.factor)
The binary integer indicator variables in the dataset are converted to factor variables.
str(data2)
## 'data.frame': 17137 obs. of 33 variables:
## $ year : int 1994 1994 1994 1994 1994 1994 1994 1994 1994 1994 ...
## $ workstat : Factor w/ 10 levels "iap","working fulltime",..: 8 2 2 2 3 3 2 2 2 2 ...
## $ prestige : num 46 22 29 42 36 43 20 44 42 46 ...
## $ divorce : Factor w/ 5 levels "iap","yes","no",..: 3 3 3 2 3 3 3 3 3 3 ...
## $ widowed : Factor w/ 5 levels "iap","yes","no",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ educ : num 12 12 12 8 13 15 9 12 12 12 ...
## $ reg16 : Factor w/ 10 levels "foreign","new england",..: 3 1 1 1 3 3 3 1 1 3 ...
## $ babies : num 2 0 0 0 0 0 0 0 2 0 ...
## $ preteen : num 3 0 0 0 1 0 1 0 1 0 ...
## $ teens : num 0 0 0 0 1 0 1 0 0 1 ...
## $ income : Factor w/ 16 levels "iap","lt $1000",..: 10 13 11 11 10 13 12 10 13 11 ...
## $ region : Factor w/ 10 levels "not assigned",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ attend : Factor w/ 10 levels "never","lt once a year",..: 4 8 9 3 3 4 9 9 4 9 ...
## $ happy : Factor w/ 6 levels "iap","very happy",..: 3 2 3 4 4 3 4 2 3 3 ...
## $ owngun : Factor w/ 6 levels "iap","yes","no",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ tvhours : num 2 3 1 3 3 0 10 4 2 3 ...
## $ vhappy : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 2 1 1 ...
## $ mothfath16 : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 2 1 2 ...
## $ black : Factor w/ 2 levels "0","1": 2 2 2 1 2 2 1 2 2 2 ...
## $ gwbush04 : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ female : Factor w/ 2 levels "0","1": 2 1 2 1 2 2 2 1 2 2 ...
## $ blackfemale: Factor w/ 2 levels "0","1": 2 1 2 1 2 2 1 1 2 2 ...
## $ gwbush00 : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ occattend : Factor w/ 2 levels "0","1": 2 1 1 1 1 2 1 1 2 1 ...
## $ regattend : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 2 2 1 2 ...
## $ y94 : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ y96 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ y98 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ y00 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ y02 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ y04 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ y06 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ unem10 : Factor w/ 2 levels "0","1": 2 1 1 2 1 1 1 1 1 2 ...
## - attr(*, "time.stamp")= chr "22 Jan 2013 14:09"
## - attr(*, "label.table")=List of 15
## ..$ wrkstat : Named num [1:10] 0 1 2 3 4 5 6 7 8 9
## .. ..- attr(*, "names")= chr [1:10] "iap" "working fulltime" "working parttime" "temp not working" ...
## ..$ prestg80: Named num 0
## .. ..- attr(*, "names")= chr "dk,na,iap"
## ..$ divorce : Named num [1:5] 0 1 2 8 9
## .. ..- attr(*, "names")= chr [1:5] "iap" "yes" "no" "dk" ...
## ..$ widowed : Named num [1:5] 0 1 2 8 9
## .. ..- attr(*, "names")= chr [1:5] "iap" "yes" "no" "dk" ...
## ..$ educ : Named num [1:3] 97 98 99
## .. ..- attr(*, "names")= chr [1:3] "iap" "dk" "na"
## ..$ reg16 : Named num [1:10] 0 1 2 3 4 5 6 7 8 9
## .. ..- attr(*, "names")= chr [1:10] "foreign" "new england" "middle atlantic" "e. nor. central" ...
## ..$ babies : Named num [1:2] 8 9
## .. ..- attr(*, "names")= chr [1:2] "8 or more" "na"
## ..$ preteen : Named num [1:2] 8 9
## .. ..- attr(*, "names")= chr [1:2] "8 or more" "na"
## ..$ teens : Named num [1:2] 8 9
## .. ..- attr(*, "names")= chr [1:2] "8 or more" "na"
## ..$ income : Named num [1:16] 0 1 2 3 4 5 6 7 8 9 ...
## .. ..- attr(*, "names")= chr [1:16] "iap" "lt $1000" "$1000 to 2999" "$3000 to 3999" ...
## ..$ region : Named num [1:10] 0 1 2 3 4 5 6 7 8 9
## .. ..- attr(*, "names")= chr [1:10] "not assigned" "new england" "middle atlantic" "e. nor. central" ...
## ..$ attend : Named num [1:10] 0 1 2 3 4 5 6 7 8 9
## .. ..- attr(*, "names")= chr [1:10] "never" "lt once a year" "once a year" "sevrl times a yr" ...
## ..$ happy : Named num [1:6] 0 1 2 3 8 9
## .. ..- attr(*, "names")= chr [1:6] "iap" "very happy" "pretty happy" "not too happy" ...
## ..$ owngun : Named num [1:6] 0 1 2 3 8 9
## .. ..- attr(*, "names")= chr [1:6] "iap" "yes" "no" "refused" ...
## ..$ tvhours : Named num [1:3] -1 98 99
## .. ..- attr(*, "names")= chr [1:3] "iap" "dk" "na"
data2$happy<-droplevels(data2$happy)
Here we have dropped the unused levels of the happy variable, namely "iap", "dk", and "na".
library(ggplot2)

# Dodged bar charts of happiness against every categorical predictor,
# one plot per variable. The original series of 24 copy-pasted ggplot
# calls skipped `y96`; the loop below includes it.
cat_vars <- c("workstat", "divorce", "widowed", "reg16", "income",
              "region", "attend", "owngun", "vhappy", "mothfath16",
              "black", "gwbush04", "female", "blackfemale", "gwbush00",
              "occattend", "regattend", "y94", "y96", "y98", "y00",
              "y02", "y04", "y06", "unem10")
for (v in cat_vars) {
  # print() is required to render ggplot objects from inside a loop.
  print(
    ggplot(data2, aes(x = happy, fill = .data[[v]])) +
      geom_bar(position = "dodge")
  )
}
Bar plots are used to visualize the relationship between the factor variables and happiness.
# Boxplots of each numeric variable split by happiness level.
boxplot(data2$prestige~data2$happy)
boxplot(data2$educ~data2$happy)
boxplot(data2$babies~data2$happy)
boxplot(data2$preteen~data2$happy)
boxplot(data2$teens~data2$happy)
boxplot(data2$tvhours~data2$happy)
Boxplots are used to examine the relationship between the numeric variables and the happiness factor.
levels(data2$happy)[levels(data2$happy)=="pretty happy"]<-"very happy"
Here we have reduced the levels of the happy variable from three to two: since "pretty happy" and "very happy" are quite similar, we merged them into one level. Having two levels also helps us compute precision and recall.
# Stratified 70/30 train/test split on the outcome `happy`.
set.seed(123)
library(lattice)
library(caret)
train.index <- createDataPartition(data2$happy, p = 0.7, list = FALSE)
data2_train <- data2[train.index, ]
data2_test  <- data2[-train.index, ]
# Reference the outcome column by name rather than the magic index 14,
# which silently breaks if columns are ever reordered.
data2_train_labels <- data2$happy[train.index]
data2_test_labels  <- data2$happy[-train.index]
We have split the data into 70% training and 30% testing sets.
#LOGISTIC REGRESSION
# Fit a binomial logistic regression of `happy` on all other columns.
# glm() models the probability of the second factor level — here
# "not too happy" given the merged levels above.
# NOTE(review): four coefficients come back NA because of perfect
# collinearity among the dummies (see the summary output below).
set.seed(1)
data2_logistic<-glm(happy~., data=data2_train, family = "binomial")
summary(data2_logistic)
##
## Call:
## glm(formula = happy ~ ., family = "binomial", data = data2_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5935 -0.5750 -0.4185 0.0000 2.4590
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.295e+01 1.357e+02 0.464 0.642855
## year -3.136e-02 6.767e-02 -0.463 0.643078
## workstatworking parttime 5.449e-02 1.030e-01 0.529 0.596799
## workstattemp not working 6.287e-01 1.884e-01 3.337 0.000846 ***
## workstatunempl, laid off 7.380e-01 1.388e-01 5.316 1.06e-07 ***
## workstatretired 3.715e-01 9.859e-02 3.768 0.000164 ***
## workstatschool 1.357e-01 1.693e-01 0.801 0.422992
## workstatkeeping house 2.261e-01 1.061e-01 2.131 0.033116 *
## workstatother 9.292e-01 1.569e-01 5.923 3.16e-09 ***
## prestige -3.147e-03 2.711e-03 -1.161 0.245838
## divorceno 2.113e-01 9.960e-02 2.121 0.033884 *
## widowedyes 2.188e-01 1.938e-01 1.129 0.258885
## educ -3.478e-02 1.205e-02 -2.888 0.003883 **
## reg16new england -6.236e-01 2.305e-01 -2.705 0.006828 **
## reg16middle atlantic -4.396e-01 1.407e-01 -3.123 0.001788 **
## reg16e. nor. central -5.076e-01 1.545e-01 -3.286 0.001015 **
## reg16w. nor. central -6.049e-01 2.001e-01 -3.022 0.002508 **
## reg16south atlantic -4.846e-01 1.457e-01 -3.326 0.000882 ***
## reg16e. sou. central -1.145e-01 2.081e-01 -0.550 0.582204
## reg16w. sou. central -6.242e-01 1.868e-01 -3.342 0.000833 ***
## reg16mountain -5.843e-01 2.060e-01 -2.837 0.004557 **
## reg16pacific -4.731e-01 1.569e-01 -3.016 0.002563 **
## babies -1.139e-01 6.006e-02 -1.897 0.057863 .
## preteen 3.780e-02 4.990e-02 0.757 0.448774
## teens -4.670e-02 6.324e-02 -0.738 0.460223
## income$1000 to 2999 -2.743e-01 3.181e-01 -0.863 0.388410
## income$3000 to 3999 -3.718e-01 3.238e-01 -1.148 0.250905
## income$4000 to 4999 -5.608e-01 3.386e-01 -1.656 0.097672 .
## income$5000 to 5999 -8.615e-01 3.353e-01 -2.569 0.010186 *
## income$6000 to 6999 -3.320e-01 3.038e-01 -1.093 0.274555
## income$7000 to 7999 -2.988e-01 2.993e-01 -0.998 0.318228
## income$8000 to 9999 -1.862e-01 2.715e-01 -0.686 0.492697
## income$10000 - 14999 -6.009e-01 2.494e-01 -2.410 0.015957 *
## income$15000 - 19999 -7.859e-01 2.561e-01 -3.068 0.002153 **
## income$20000 - 24999 -6.312e-01 2.531e-01 -2.494 0.012642 *
## income$25000 or more -9.889e-01 2.385e-01 -4.146 3.39e-05 ***
## regionmiddle atlantic 2.654e-02 2.276e-01 0.117 0.907178
## regione. nor. central -8.047e-02 2.370e-01 -0.340 0.734226
## regionw. nor. central -2.073e-01 2.674e-01 -0.775 0.438149
## regionsouth atlantic -5.045e-02 2.224e-01 -0.227 0.820510
## regione. sou. central -4.419e-01 2.821e-01 -1.567 0.117230
## regionw. sou. central 3.379e-02 2.553e-01 0.132 0.894701
## regionmountain -1.111e-01 2.555e-01 -0.435 0.663822
## regionpacific 2.406e-02 2.351e-01 0.102 0.918468
## attendlt once a year -1.480e-01 1.149e-01 -1.288 0.197893
## attendonce a year -2.368e-01 1.021e-01 -2.319 0.020389 *
## attendsevrl times a yr -2.016e-01 1.036e-01 -1.946 0.051695 .
## attendonce a month -3.275e-01 1.315e-01 -2.490 0.012759 *
## attend2-3x a month -3.247e-01 1.238e-01 -2.623 0.008704 **
## attendnrly every week -4.999e-01 1.529e-01 -3.270 0.001075 **
## attendevery week -4.444e-01 1.026e-01 -4.331 1.48e-05 ***
## attendmore thn once wk -2.028e-01 1.367e-01 -1.483 0.137999
## owngunyes -1.551e-01 8.067e-02 -1.923 0.054492 .
## tvhours 2.632e-02 1.422e-02 1.851 0.064180 .
## vhappy1 -2.350e+01 2.954e+03 -0.008 0.993653
## mothfath161 -7.856e-02 6.469e-02 -1.214 0.224594
## black1 3.407e-01 1.322e-01 2.576 0.009985 **
## gwbush041 1.356e-01 1.743e-01 0.778 0.436467
## female1 7.882e-02 7.080e-02 1.113 0.265597
## blackfemale1 3.247e-02 1.605e-01 0.202 0.839710
## gwbush001 -4.682e-02 1.301e-01 -0.360 0.718895
## occattend1 NA NA NA NA
## regattend1 NA NA NA NA
## y941 -5.385e-01 7.681e-01 -0.701 0.483241
## y961 -4.246e-01 6.339e-01 -0.670 0.503049
## y981 -3.444e-01 5.012e-01 -0.687 0.492005
## y001 -5.106e-01 3.708e-01 -1.377 0.168503
## y021 -4.190e-01 2.476e-01 -1.692 0.090599 .
## y041 NA NA NA NA
## y061 NA NA NA NA
## unem101 4.560e-01 6.886e-02 6.622 3.55e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8888.8 on 11996 degrees of freedom
## Residual deviance: 7230.3 on 11930 degrees of freedom
## AIC: 7364.3
##
## Number of Fisher Scoring iterations: 24
data2_pred<-predict(data2_logistic, data2_test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# Threshold the probabilities at 0.5 to obtain hard class labels.
# NOTE(review): per the confusion matrix below, label "0" lines up with
# "very happy" and "1" with "not too happy" — confirm the factor level
# order before reusing this coding.
data2_new_label=factor(ifelse(data2_pred>0.5, "1","0"))
set.seed(1)
actuallabel<-data2_test$happy
# Confusion matrix: rows = predicted label, columns = actual label.
t=table(data2_new_label,actuallabel)
t
## actuallabel
## data2_new_label very happy not too happy
## 0 4501 609
## 1 14 16
From this table we read off the true positives, true negatives, false positives, and false negatives.
# Read the confusion-matrix cells from `t` instead of hard-coding the
# counts, so the numbers stay correct if the data or seed changes.
# "very happy" is treated as the positive class; predicted label "0"
# (p <= 0.5) corresponds to "very happy".
TP <- t["0", "very happy"]
FN <- t["1", "very happy"]
FP <- t["0", "not too happy"]
TN <- t["1", "not too happy"]
# Accuracy = correct predictions over all predictions.
accuracy_logistic <- (TP + TN) / sum(t)
accuracy_logistic
## [1] 0.8787938
The accuracy is 0.8787938
# Precision: TP / (TP + FP).
# (Variable renamed from the misspelled `percision_logistic`; nothing
# downstream references the old name.)
precision_logistic <- TP / (TP + FP)
precision_logistic
## [1] 0.8808219
The precision is 0.8808219
# Recall (sensitivity): TP / (TP + FN).
recall_logistic<-TP/(TP+FN)
recall_logistic
## [1] 0.9968992
The recall is 0.9968992
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
auc(data2_test$happy, data2_pred)
## Setting levels: control = very happy, case = not too happy
## Setting direction: controls < cases
## Area under the curve: 0.7807
AUC is 0.7807
#RANDOM FOREST
# randomForest provides the ensemble classifier used below.
set.seed(1)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# Fit a random forest (package defaults: 500 trees, mtry chosen
# automatically) on the training data.
data2_rf <- randomForest(happy ~ ., data = data2_train)
data2_rf
##
## Call:
## randomForest(formula = happy ~ ., data = data2_train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 12.23%
## Confusion matrix:
## very happy not too happy class.error
## very happy 10514 22 0.002088079
## not too happy 1445 16 0.989048597
# Predicted classes on the test set and confusion matrix
# (rows = predicted, columns = actual).
set.seed(1)
data2_rf_predictions=predict(data2_rf, data2_test)
table(data2_rf_predictions,data2_test$happy)
##
## data2_rf_predictions very happy not too happy
## very happy 4508 623
## not too happy 7 2
# Cell counts transcribed from the confusion matrix printed above,
# with "very happy" treated as the positive class.
# NOTE(review): hard-coded values will be stale if the data or seed
# changes — consider reading them from the table instead.
TP<-4508
FN<-7
FP<-623
TN<-2
# Accuracy = correct predictions over all predictions.
accuracy_random<-(TP+TN)/(TP+TN+FP+FN)
accuracy_random
## [1] 0.8774319
The accuracy is 0.8774319
# Precision: TP / (TP + FP).
# (Variable renamed from the misspelled `percision_random`; nothing
# downstream references the old name.)
precision_random <- TP / (TP + FP)
precision_random
## [1] 0.8785812
The precision is 0.8785812
# Recall (sensitivity): TP / (TP + FN).
recall_random<-TP/(TP+FN)
recall_random
## [1] 0.9984496
The recall is 0.9984496
library(pROC)
# NOTE(review): auc() is given hard class labels here
# (as.numeric of the predicted factor), not probabilities, which is why
# the AUC collapses to ~0.5; predict(data2_rf, ..., type = "prob")
# would give a meaningful ROC curve.
auc(data2_test$happy, as.numeric(data2_rf_predictions))
## Setting levels: control = very happy, case = not too happy
## Setting direction: controls < cases
## Area under the curve: 0.5008
The AUC is 0.5008
#SVM
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
# Linear-kernel SVM ("vanilladot") with default cost C = 1.
data2_svm <- ksvm(happy ~ ., data = data2_train, kernel = "vanilladot")
## Setting default kernel parameters
data2_svm
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 4193
##
## Objective Function Value : -2922
## Training error : 0.12178
# Predicted classes on the test set and confusion matrix
# (rows = predicted, columns = actual).
svm_pred<-predict(data2_svm,data2_test)
table(svm_pred, data2_test$happy)
##
## svm_pred very happy not too happy
## very happy 4515 625
## not too happy 0 0
# Cell counts transcribed from the confusion matrix above. The SVM
# predicts "very happy" for every test row, so FN = TN = 0.
# NOTE(review): hard-coded values will be stale if the data or seed
# changes — consider reading them from the table instead.
TP<-4515
FN<-0
FP<-625
TN<-0
# Accuracy = correct predictions over all predictions; here it equals
# the majority-class base rate because only one class is predicted.
accuracy_svm<-(TP+TN)/(TP+TN+FP+FN)
accuracy_svm
## [1] 0.8784047
The accuracy is 0.8784047
# Precision: TP / (TP + FP).
# (Variable renamed from the misspelled `percision_svm`; nothing
# downstream references the old name.)
precision_svm <- TP / (TP + FP)
precision_svm
## [1] 0.8784047
The precision is 0.8784047
# Recall (sensitivity): TP / (TP + FN); equals 1 here since FN = 0.
recall_svm<-TP/(TP+FN)
recall_svm
## [1] 1
The recall is 1 — but only because the SVM predicts every test observation as "very happy" (the majority class), so there are no false negatives.
auc(data2_test$happy, as.numeric(svm_pred))
## Setting levels: control = very happy, case = not too happy
## Setting direction: controls < cases
## Area under the curve: 0.5
The AUC is 0.5